---
title: "Online Appendix: The other side of platform politics. Law-making and online participatory democracy in the Five Star Movement \\ Swiss Political Science Review"
author: "Filippo Tronconi (University of Bologna) and Francesco Bailo (University of Sydney) "
date: "2024-07-10"
output: pdf_document
---

```{r setup, include=FALSE}
# Chunk defaults for the whole document: do not echo code, and keep warnings
# and messages out of the rendered output.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
# Packages used by the chunks below.
# NOTE(review): dplyr is normally attached by tidyverse already, and no
# cowplot function is called in the code visible here — confirm both are
# still wanted.
library(tidyverse)
library(dplyr)
library(readxl)
library(kableExtra)
library(cowplot)
library(stargazer)
```

\newpage

# Regression analysis

```{r load}
# Raw corpus, excluding every row whose doc_id contains "rouss"
# (case-insensitive match).
corpus.df <-
  readRDS("corpus_raw.rds") %>%
  dplyr::filter(!grepl("rouss", doc_id, ignore.case = TRUE))

# Presumably tidy (one-token-per-row) exports of the PDF and DOCX corpora —
# TODO confirm.
# NOTE(review): corpus.df, corpus_pdf.tidy and corpus_docx.tidy are not
# referenced again in the code visible here; confirm they are still needed.
corpus_pdf.tidy <- read.csv("corpus_pdf.tidy.csv")

corpus_docx.tidy <- read.csv("corpus_docx.tidy.csv")

# LexRousseau
# Document-level metadata; joined to the similarity summary on `ID_n`.
Dataset_unico_250622 <- read_excel("Dataset_unico_250622.xlsx")

# Similarity results: `cos_sim_results.df` is summarised per document via its
# `doc_id`, `i` and `score` columns; `cos_sim_results.token` supplies one
# `similarity` value per `item1`.
cos_sim_results.df <- read.csv("cos_sim_results.csv")

cos_sim_results.token <- read.csv("cos_sim_results.token.csv")

```

```{r}

# One row per document, built from the row-level similarity scores:
#   1. for every (doc_id, i) pair keep the highest `score`
#      (`i` presumably indexes sentences within a document — TODO confirm);
#   2. per document, count those pairs and summarise their maxima, treating a
#      maximum above .95 as "high similarity";
#   3. attach the metadata columns, keeping only documents whose doc_id has a
#      matching `ID_n` in Dataset_unico_250622.
doc_summary_stats.df <-
  cos_sim_results.df %>%
  dplyr::group_by(doc_id, i) %>%
  dplyr::summarise(max_cos_sim = max(score), .groups = "drop") %>%
  dplyr::group_by(doc_id) %>%
  dplyr::summarise(
    sentences = n(),
    median_sim = median(max_cos_sim),
    mean_sim = mean(max_cos_sim),
    high_sim_n = sum(max_cos_sim > .95),
    # Share of high-similarity pairs, as a percentage of all pairs.
    high_sim_perc = high_sim_n / sentences * 100
  ) %>%
  dplyr::inner_join(Dataset_unico_250622, by = c(doc_id = "ID_n"))

# Attach the token-level similarity to each document.
# `item1` is reduced to a number by removing a trailing ".pdf" (presumably the
# values are file names of the form "<ID>.pdf" — TODO confirm) and then looked
# up against `doc_id`; documents without a match get NA.
# The pattern is escaped and anchored: the bare pattern ".pdf" is a regular
# expression in which "." matches any character, so it would also delete
# sequences such as "xpdf" from the middle of a name.
doc_summary_stats.df$token_sim <-
  cos_sim_results.token$similarity[
    match(
      doc_summary_stats.df$doc_id,
      as.numeric(sub("\\.pdf$", "", cos_sim_results.token$item1))
    )
  ]

# Collapse `Partito` into a two-level factor. "M5S e altri" is the first
# (reference) level; the three listed M5S-only spellings map to "M5S only".
# Any other value of `Partito` (including NA) becomes NA.
doc_summary_stats.df$Partito.n <- factor(
  case_when(
    doc_summary_stats.df$Partito %in% "M5S e altri" ~ "M5S e altri",
    doc_summary_stats.df$Partito %in% c(
      "M5S",
      "M5S, On. Azzurra Pia Maria Cancelleri",
      "M5S, On. Paolo Bernini"
    ) ~ "M5S only"
  ),
  levels = c("M5S e altri", "M5S only")
)

# Derive a Date column from `Data presentazione`.
# Entries are converted to numbers and read as day counts from 1899-12-30
# (presumably Excel serial dates, since the table comes from an .xlsx file —
# TODO confirm). Entries that are not numeric become NA at this step.
# The single literal value "14 gennaio 2016" is then set by hand.
doc_summary_stats.df <-
  doc_summary_stats.df %>%
  dplyr::mutate(
    date = as.Date(as.numeric(`Data presentazione`), origin = "1899-12-30")
  ) %>%
  dplyr::mutate(
    date = replace(
      date,
      which(`Data presentazione` == "14 gennaio 2016"),
      as.Date("2016-01-14")
    )
  )

# TRUE when `Esito` is exactly "Assegnato (non ancora iniziato l'esame)";
# FALSE for every other value, including NA.
doc_summary_stats.df <-
  doc_summary_stats.df %>%
  dplyr::mutate(
    assegnato_non_iniziato =
      Esito %in% "Assegnato (non ancora iniziato l'esame)"
  )

# Flag the least similar documents: TRUE when a document's mean_sim is at or
# below the 5th (resp. 10th) percentile of mean_sim across all documents.
# `probs` is spelled out in full rather than relying on partial matching of
# `p`, and `names = FALSE` drops the "5%"/"10%" label from the threshold so
# it cannot leak into the comparison result.
doc_summary_stats.df$bottom_5pct <-
  doc_summary_stats.df$mean_sim <=
  quantile(doc_summary_stats.df$mean_sim, probs = .05, names = FALSE)

doc_summary_stats.df$bottom_10pct <-
  doc_summary_stats.df$mean_sim <=
  quantile(doc_summary_stats.df$mean_sim, probs = .1, names = FALSE)

```

```{r}
# Assign each document to one of three periods based on `date`:
#   up to and including 2018-03-22           -> "XVII leg."
#   after that, up to and including 2019-09-05 -> "XVIII leg. - Conte I"
#   after 2019-09-05                          -> "XVIII leg. - Conte II"
# case_when() uses the first branch that is TRUE, so the second branch does
# not need to repeat the lower bound. Documents with a missing date get NA.
# The result is an unordered factor with "XVII leg." as the first level.
doc_summary_stats.df$time_period.n <-
  factor(
    dplyr::case_when(
      doc_summary_stats.df$date <= as.Date("2018-03-22") ~
        "XVII leg.",
      doc_summary_stats.df$date <= as.Date("2019-09-05") ~
        "XVIII leg. - Conte I",
      doc_summary_stats.df$date > as.Date("2019-09-05") ~
        "XVIII leg. - Conte II"
    ),
    levels = c(
      "XVII leg.",
      "XVIII leg. - Conte I",
      "XVIII leg. - Conte II"
    ),
    ordered = FALSE
  )
```

```{r}
# Three models sharing the same right-hand side: party grouping (Partito.n),
# the per-document count `sentences`, `n commenti`, `n risposte firmatario`,
# and the period factor time_period.n.

# fit1: linear model (lm) with the document's mean_sim as the outcome.
fit1 <- 
  lm(mean_sim ~ 
       Partito.n +
       sentences + 
       `n commenti` + 
       `n risposte firmatario` +
       time_period.n, 
     data = doc_summary_stats.df)

# fit1a: binomial glm with the bottom_5pct indicator as the outcome.
fit1a <- 
  glm(bottom_5pct ~ 
       Partito.n +
       sentences + 
       `n commenti` + 
       `n risposte firmatario` +
       time_period.n, family = "binomial", 
     data = doc_summary_stats.df)

# fit1b: same specification as fit1a, with the bottom_10pct indicator as the
# outcome.
fit1b <- 
  glm(bottom_10pct ~ 
       Partito.n +
       sentences + 
       `n commenti` + 
       `n risposte firmatario` +
       time_period.n, family = "binomial", 
     data = doc_summary_stats.df)

# fit2: linear model (lm) with token_sim as the outcome.
# NOTE(review): fit2 is not passed to the stargazer() call visible in this
# file — confirm whether it should be reported or removed.
# NOTE(review): as.numeric(`Data presentazione`) is NA for any entry that is
# not a number (the same column contains the literal "14 gennaio 2016");
# such rows are presumably dropped by lm's NA handling — confirm.
# NOTE(review): log() of a zero count is -Inf, which lm() does not accept —
# confirm `n followers twitter` is strictly positive.
fit2 <- 
  lm(token_sim ~ 
       Partito.n +
       `n commenti` + Legislatura + 
       `n risposte firmatario` +
       as.numeric(`Data presentazione`) +
       log(`n followers twitter`), data = doc_summary_stats.df)
```

```{r results = 'asis'}
# Regression table for fit1, fit1a and fit1b (emitted as-is into the
# document). The covariate labels are given in the order of the model terms:
# party grouping, sentences, comments, responses, the two non-reference
# period levels, and the intercept.
# NOTE(review): "FMS" in the first covariate label may be a typo (the data
# use "M5S"; the title spells out "Five Star Movement") — confirm the
# intended abbreviation.
stargazer(
  fit1, fit1a, fit1b,
  header = FALSE,
  title = "Regressions predicting document-level similarity score and whether a document is among the least similar documents in the corpus",
  dep.var.labels = c("Mean cosine sim.", "Bottom 5 perc.", "Bottom 10 perc."),
  covariate.labels = c(
    "FMS only among presenters",
    "sentences",
    "n. comments",
    "n. responses by proposing MP",
    "XVIII leg. - Conte I",
    "XVIII leg. - Conte II",
    "Constant"
  )
)
```

\newpage

# Cosine similarity: Examples

```{r}

# Load the example pairs and bin `score` into 8 equal-width intervals
# (`level`), used below to sample examples from every similarity band.
cos_sim_results_full_text <- read.csv("cos_sim_results_full_text.csv")
cos_sim_results_full_text$level <-
  cut(cos_sim_results_full_text[["score"]], breaks = 8)

```

## High similarity only (>.95)

```{r results = 'asis'}

# Table of up to 8 randomly chosen pairs with score above .95.
# slice_sample() replaces the superseded sample_n(): it matches the sampling
# call used in the next chunk and returns all available rows instead of
# failing when fewer than 8 rows pass the filter.
# NOTE(review): no seed is set in the code visible here, so the sampled rows
# change on every render — consider set.seed() if the appendix must be
# reproducible.
cos_sim_results_full_text %>%
  dplyr::filter(score > .95) %>%
  dplyr::slice_sample(n = 8) %>%
  dplyr::select(-level) %>%
  kbl(digits = 2, format = "latex", booktabs = TRUE, longtable = TRUE) %>%
  # Fixed widths for columns 2 and 3 of the table.
  column_spec(2, width = "20em") %>%
  column_spec(3, width = "20em") %>%
  kable_styling(latex_options = c("striped", "repeat_header"))
```

## All similarity levels

```{r results = 'asis'}

# Table with up to 2 randomly chosen pairs from each of the score bands in
# `level`, so that examples span the whole similarity range.
# NOTE(review): no seed is set in the code visible here, so the sampled rows
# change on every render.
cos_sim_results_full_text %>%
  dplyr::group_by(level) %>%
  dplyr::slice_sample(n = 2) %>%
  dplyr::ungroup() %>%
  dplyr::select(-level) %>%
  kbl(digits = 2, format = "latex", booktabs = TRUE, longtable = TRUE) %>%
  # Fixed widths for columns 2 and 3 of the table.
  column_spec(2, width = "20em") %>%
  column_spec(3, width = "20em") %>%
  kable_styling(latex_options = c("striped", "repeat_header"))
```



